sem-reg 0.1.1

Handle certain Windows registry binary values semantically. Extensible.
Documentation
impl ByteSeq {
    //...

    pub fn read_ansi_string(&mut self, len: Option<usize>) -> Result<Vec<u8>, ParseError> {
        //! Reads a zero-terminated ANSI string.

        if let Some((ansi_str, size)) = self.get_ansi_str(self.read_index, len) {
            let ansi_string = ansi_str.to_vec();
            self.read_index += size;
            Ok(ansi_string)
        } else {
            Err(ParseError::ExpectedAnsiString(self.read_index))
        }
    }

    fn get_ansi_str(&self, start_index: usize, mode: StringMode) -> Option<(&[u8], usize)> {
        //! Looks for an ANSI string beginning at `start_index`. Doesn't move the read index.
        //!
        //! Returns the bytes of the string and the number of bytes to skip to get past it, or `None` if `mode` can't be satisfied:
        //!
        //! - `StringMode::Len`: exactly that many bytes, none of them zero. The size is the length.
        //! - `StringMode::TillZero`: everything up to the next zero byte. The terminator is counted in the size, but not contained in the returned slice.
        //! - `StringMode::TillZeroInSectionLen`: unfinished, see the note in the respective arm.

        let mut end_index = start_index;

        loop {
            // A string of fixed length is complete as soon as enough bytes were accepted. This must be decided before fetching the next byte, since whatever follows the string (a zero byte, or the end of the data) doesn't belong to it.
            if let StringMode::Len(len) = mode {
                if end_index - start_index == len {
                    break self
                        .bytes
                        .get(start_index..end_index)
                        .map(|ansi_str| (ansi_str, len));
                }
            }

            if let Some(byte) = self.bytes.get(end_index) {
                // Each arm is responsible for breaking out of the loop or advancing `end_index`.
                match mode {
                    StringMode::Len(_) => {
                        // A zero byte within the requested length means that the string is too short.
                        if *byte == 0 {
                            break None;
                        }

                        end_index += 1;
                    }
                    StringMode::TillZero => {
                        if *byte == 0 {
                            let slice = &self.bytes[start_index..end_index];
                            break Some((slice, slice.len() + 1));
                        } else {
                            end_index += 1;
                        }
                    }
                    StringMode::TillZeroInSectionLen(len) => {
                        // NOTE(review): This mode isn't implemented. As written, neither branch breaks or advances `end_index`, so the loop never ends for this mode. The intended semantics can't be told from here - TODO implement before use.
                        if *byte == 0 {
                            //...
                        } else {
                            //...
                        }
                    }
                }
            } else {
                // Ran out of data before the string was complete.
                break None;
            }
        }
    }

    pub fn push_ansi_str(&mut self, ansi_str: &[u8]) {
        //! Appends the bytes of `ansi_str`, followed by a single zero byte as terminator.

        let terminated = ansi_str.iter().copied().chain(std::iter::once(0));
        self.bytes.extend(terminated);
    }

    pub fn read_wide_string(&mut self, len: Option<usize>) -> Result<Vec<u16>, ParseError> {
        //! Reads a wide string at the read index, zero-terminated if `len` is `None`.
        //!
        //! On success, the read index is advanced by the size reported by `get_wide_string()`. Otherwise the read index is left untouched and returned within `ParseError::ExpectedWideString`.

        match self.get_wide_string(self.read_index, len) {
            Some((wide_string, size)) => {
                self.read_index += size;
                Ok(wide_string)
            }
            None => Err(ParseError::ExpectedWideString(self.read_index)),
        }
    }

    fn get_wide_string(&self, start_index: usize, len: Option<usize>) -> Option<(Vec<u16>, usize)> {
        //! Decodes a wide string (2 bytes per character, little endian) beginning at `start_index`. Doesn't move the read index.
        //!
        //! With `len` being `Some`, the string must consist of exactly that many non-zero characters. With `None`, it extends up to a zero character, which is counted in the size, but not contained in the returned string.
        //!
        //! Returns the characters and the number of bytes to skip to get past the string, or `None` if there is no such string.

        // An incomplete trailing byte is ignored by `chunks_exact()`, which is the same as running out of data.
        let wide_chars = self
            .bytes
            .get(start_index..)?
            .chunks_exact(2)
            .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
        let mut wide_string = Vec::new();

        // An empty string of known length is complete without looking at any data.
        if len == Some(0) {
            return Some((wide_string, 0));
        }

        for wide_char in wide_chars {
            if wide_char == 0 {
                return match len {
                    // A zero character within the requested length means that the string is too short.
                    Some(_) => None,
                    None => {
                        // The size must be determined before the vector is moved into the result.
                        let size = (wide_string.len() + 1) * 2;
                        Some((wide_string, size))
                    }
                };
            }

            wide_string.push(wide_char);

            if Some(wide_string.len()) == len {
                let size = wide_string.len() * 2;
                return Some((wide_string, size));
            }
        }

        // Ran out of data before the string was complete.
        None
    }

    pub fn push_wide_string(&mut self, wide_string: &[u16]) {
        // Data is already in little endian.
        self.bytes.extend_from_slice(unsafe {
            slice::from_raw_parts(wide_string.as_ptr() as *const u8, wide_string.len() * 2)
        });
        self.bytes.extend_from_slice(&[0, 0]);
    }

    pub fn find_data(&self, reduced_amount: bool) -> Vec<Datum> {
        //! Helper for finding out the format or registry values. Tries to interpret the data at every index in a variety of formats. Imperfect and not thoroughly tested.
        //!
        //! Strings may also be length-prefixed, where the length may be a VLQ with possibly just one byte.

        let mut data = Vec::new();

        for i in 0..self.bytes.len() {
            let u_16 = self.get_int::<u16>(i);
            let u_32 = self.get_int::<u32>(i);
            let u_64 = self.get_int::<u64>(i);

            let mut must_push_u_16 = false;
            let mut must_push_u_32 = false;
            let mut must_push_u_64 = false;

            let vlq_64 = self.get_vlq_64(i).filter(|(_, size)| *size >= 2);

            let ascii_string = self
                .get_ansi_str(i)
                .filter(|(ansi_str, _)| {
                    ansi_str.len() >= 4 && ansi_str.iter().all(|&byte| byte >= 0x20 && byte < 0x80)
                })
                .map(|(ansi_str, size)| {
                    (
                        unsafe { str::from_utf8_unchecked(ansi_str) }.to_string(),
                        size,
                    )
                });
            let wide_string = self
                .get_wide_string(i)
                .filter(|(wide_string, _)| {
                    wide_string.len() >= 4
                        && wide_string
                            .iter()
                            .all(|&wide_char| wide_char >= 0x20 && wide_char < 0x2600)
                })
                .and_then(|(wide_string, size)| {
                    String::from_utf16(&wide_string)
                        .ok()
                        .map(|string| (string, size))
                });

            // Byte count.
            let bytes_left = self.bytes.len() - i;

            let mut byte_count = None;

            let mut set_byte_count =
                |int: Option<(usize, usize)>, extra_action: &mut dyn FnMut()| {
                    if let (None, Some((int, size))) = (byte_count, int) {
                        let bytes_left_all = bytes_left - size;
                        let bytes_left_less = bytes_left_all.saturating_sub(10);
                        byte_count = (int > 0 && int >= bytes_left_less && int <= bytes_left_all)
                            .then(|| {
                                extra_action();
                                int as _
                            });
                    }
                };

            set_byte_count(vlq_64.map(|(value, size)| (value as _, size)), &mut || {});
            set_byte_count(u_16.map(|(value, size)| (value as _, size)), &mut || {
                must_push_u_16 = true;
            });
            set_byte_count(u_32.map(|(value, size)| (value as _, size)), &mut || {
                must_push_u_32 = true;
            });

            // Timestamp.
            const MIN_EPOCH_SECS: u32 = 1420070400; // 2015-01-01T00:00Z
            let max_epoch_secs = chrono::Utc::now().timestamp() + 31536000; // Now plus one year.
            const MIN_FILETIME: u64 = 130645440000000000; // 2015-01-01T00:00Z
            let max_filetime = FileTime::now().filetime() as u64 + 315360000000000; // Now plus one year.

            let epoch_secs_from_u_32 = u_32.map(|(value, _)| value as u64).filter(|value| {
                *value >= MIN_EPOCH_SECS as _ && *value <= max_epoch_secs as _ && !reduced_amount
            });
            must_push_u_32 = must_push_u_32 || epoch_secs_from_u_32.is_some();

            let filetime_from_u_64 = u_64
                .map(|(value, _)| value)
                .filter(|value| *value >= MIN_FILETIME && *value <= max_filetime);
            must_push_u_64 = must_push_u_64 || filetime_from_u_64.is_some();

            let epoch_secs_from_vlq = vlq_64
                .map(|(value, _)| value)
                .filter(|value| *value >= MIN_EPOCH_SECS as _ && *value <= max_epoch_secs as _);
            let filetime_from_vlq = vlq_64
                .map(|(value, _)| value)
                .filter(|value| *value >= MIN_FILETIME && *value <= max_filetime);

            // Return data.
            if let Some(byte_count) = byte_count {
                data.push(Datum::ByteCount(byte_count, i));
            }

            for epoch_secs in [epoch_secs_from_u_32, epoch_secs_from_vlq] {
                if let Some(epoch_secs) = epoch_secs {
                    data.push(Datum::EpochSecsTimestamp(
                        utc_epoch_secs_to_local_iso_string(epoch_secs),
                        i,
                    ));
                }
            }

            for filetime in [filetime_from_u_64, filetime_from_vlq] {
                if let Some(filetime) = filetime {
                    data.push(Datum::FiletimeTimestamp(
                        utc_filetime_to_local_iso_string(filetime),
                        i,
                    ));
                }
            }

            if let (true, Some((value, _))) = (must_push_u_16, u_16) {
                data.push(Datum::U16(value, i));
            }

            if let (true, Some((value, _))) = (must_push_u_32, u_32) {
                data.push(Datum::U32(value, i));
            }

            if let (true, Some((value, _))) = (must_push_u_64, u_64) {
                data.push(Datum::U64(value, i));
            }

            if let Some((value, _)) = vlq_64 {
                data.push(Datum::Vlq64(value, Self::zigzag_64_decode(value as _), i));
            }

            if let Some((ascii_string, _)) = ascii_string {
                data.push(Datum::AsciiString(ascii_string, i));
            }

            if let Some((wide_string, _)) = wide_string {
                let mut must_push = true;
                for datum in data.iter() {
                    if let Datum::Utf16WideString(other_wide_string, _) = datum {
                        if other_wide_string.ends_with(&wide_string) {
                            must_push = false;
                        }
                    }
                }

                if must_push {
                    data.push(Datum::Utf16WideString(wide_string, i));
                }
            }
        }

        data
    }
}

/// How the extent of a string within the byte sequence is determined.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StringMode {
    /// The string has exactly this many bytes, none of which may be zero.
    Len(usize),
    /// The string extends up to the next zero byte, which terminates it.
    TillZero,
    /// NOTE(review): presumably a zero-terminated string within a section of this many bytes. The handling of this mode isn't finished - TODO confirm the semantics.
    TillZeroInSectionLen(usize),
}

/// A possible interpretation of raw bytes, as reported by `ByteSeq::find_data()`. All variants have the index at which the datum was found as their last tuple item.
#[derive(Debug)]
pub enum Datum {
    /// A number that also has another reasonable interpretation (as byte count).
    U16(u16, usize),
    /// A number that also has another reasonable interpretation (as byte count or epoch seconds timestamp).
    U32(u32, usize),
    /// A number that also has another reasonable interpretation (as filetime timestamp).
    U64(u64, usize),
    /// VLQ with at least 2 bytes. Additionally presented in zigzag-decoded form.
    Vlq64(u64, i64, usize),
    /// Possible number of bytes between the datum and the end. The value may be up to 10 bytes less than what actually follows, and is never 0.
    ByteCount(usize, usize),
    /// Local time as string. Only between 2015-01-01 (UTC) and one year after the time of searching.
    EpochSecsTimestamp(String, usize),
    /// Local time as string. Only between 2015-01-01 (UTC) and one year after the time of searching.
    FiletimeTimestamp(String, usize),
    /// At least 4 characters. Only code points from U+0020 to U+007F.
    AsciiString(String, usize),
    /// At least 4 characters. Only code units from U+0020 up to, but excluding, U+2600. Left out if it is the tail of a wide string reported before.
    Utf16WideString(String, usize),
}

#[cfg(test)]
mod tests {
    use super::ByteSeq;

    // Scratch test: runs `find_data()` with reduced amount over a sample value and prints every datum found.
    // Nothing is asserted, so it is ignored by default and only useful when run explicitly with the output shown.
    #[ignore]
    #[test]
    fn tmp() {
        let byte_seq = ByteSeq::from_bytes(
            [
                0x43, 0x42, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x00, 0x2a, 0x06, 0xb9, 0xdd, 0xae, 0xaa,
                0x06, 0x2a, 0x2b, 0x0e, 0xa1, 0x04, 0x43, 0x42, 0x01, 0x00, 0x12, 0x60, 0x7b, 0x00,
                0x39, 0x00, 0x30, 0x00, 0x31, 0x00, 0x32, 0x00, 0x30, 0x00, 0x30, 0x00, 0x30, 0x00,
                0x30, 0x00, 0x2d, 0x00, 0x30, 0x00, 0x30, 0x00, 0x31, 0x00, 0x37, 0x00, 0x2d, 0x00,
                0x30, 0x00, 0x30, 0x00, 0x30, 0x00, 0x30, 0x00, 0x2d, 0x00, 0x30, 0x00, 0x30, 0x00,
                0x30, 0x00, 0x30, 0x00, 0x2d, 0x00, 0x30, 0x00, 0x30, 0x00, 0x30, 0x00, 0x30, 0x00,
                0x30, 0x00, 0x30, 0x00, 0x30, 0x00, 0x46, 0x00, 0x46, 0x00, 0x31, 0x00, 0x43, 0x00,
                0x45, 0x00, 0x7d, 0x00, 0x5f, 0x00, 0x53, 0x00, 0x68, 0x00, 0x61, 0x00, 0x72, 0x00,
                0x65, 0x00, 0x50, 0x00, 0x6f, 0x00, 0x69, 0x00, 0x6e, 0x00, 0x74, 0x00, 0x44, 0x00,
                0x65, 0x00, 0x73, 0x00, 0x69, 0x00, 0x67, 0x00, 0x6e, 0x00, 0x65, 0x00, 0x72, 0x00,
                0x5f, 0x00, 0x7b, 0x00, 0x30, 0x00, 0x32, 0x00, 0x32, 0x00, 0x30, 0x00, 0x36, 0x00,
                0x44, 0x00, 0x43, 0x00, 0x43, 0x00, 0x2d, 0x00, 0x30, 0x00, 0x43, 0x00, 0x41, 0x00,
                0x46, 0x00, 0x2d, 0x00, 0x34, 0x00, 0x36, 0x00, 0x42, 0x00, 0x42, 0x00, 0x2d, 0x00,
                0x38, 0x00, 0x45, 0x00, 0x44, 0x00, 0x43, 0x00, 0x2d, 0x00, 0x36, 0x00, 0x43, 0x00,
                0x32, 0x00, 0x38, 0x00, 0x31, 0x00, 0x41, 0x00, 0x41, 0x00, 0x32, 0x00, 0x31, 0x00,
                0x45, 0x00, 0x46, 0x00, 0x41, 0x00, 0x7d, 0x00, 0x32, 0x0c, 0x45, 0x00, 0x78, 0x00,
                0x74, 0x00, 0x65, 0x00, 0x72, 0x00, 0x6e, 0x00, 0x61, 0x00, 0x6c, 0x00, 0x20, 0x00,
                0x4d, 0x00, 0x53, 0x00, 0x49, 0x00, 0x4a, 0x00, 0x6a, 0x00, 0x92, 0x42, 0x55, 0x00,
                0x70, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x65, 0x00, 0x20, 0x00, 0x66, 0x00,
                0x6f, 0x00, 0x72, 0x00, 0x20, 0x00, 0x4d, 0x00, 0x69, 0x00, 0x63, 0x00, 0x72, 0x00,
                0x6f, 0x00, 0x73, 0x00, 0x6f, 0x00, 0x66, 0x00, 0x74, 0x00, 0x20, 0x00, 0x4f, 0x00,
                0x66, 0x00, 0x66, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x20, 0x00, 0x32, 0x00,
                0x30, 0x00, 0x30, 0x00, 0x37, 0x00, 0x20, 0x00, 0x73, 0x00, 0x75, 0x00, 0x69, 0x00,
                0x74, 0x00, 0x65, 0x00, 0x73, 0x00, 0x20, 0x00, 0x28, 0x00, 0x4b, 0x00, 0x42, 0x00,
                0x32, 0x00, 0x35, 0x00, 0x39, 0x00, 0x36, 0x00, 0x37, 0x00, 0x38, 0x00, 0x37, 0x00,
                0x29, 0x00, 0x20, 0x00, 0x33, 0x00, 0x32, 0x00, 0x2d, 0x00, 0x42, 0x00, 0x69, 0x00,
                0x74, 0x00, 0x20, 0x00, 0x45, 0x00, 0x64, 0x00, 0x69, 0x00, 0x74, 0x00, 0x69, 0x00,
                0x6f, 0x00, 0x6e, 0x00, 0xb2, 0x09, 0x4d, 0x00, 0x69, 0x00, 0x63, 0x00, 0x72, 0x00,
                0x6f, 0x00, 0x73, 0x00, 0x6f, 0x00, 0x66, 0x00, 0x74, 0x00, 0xd2, 0x0a, 0x27, 0x68,
                0x00, 0x74, 0x00, 0x74, 0x00, 0x70, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x73,
                0x00, 0x75, 0x00, 0x70, 0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x2e,
                0x00, 0x6d, 0x00, 0x69, 0x00, 0x63, 0x00, 0x72, 0x00, 0x6f, 0x00, 0x73, 0x00, 0x6f,
                0x00, 0x66, 0x00, 0x74, 0x00, 0x2e, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x2f,
                0x00, 0x6b, 0x00, 0x62, 0x00, 0x2f, 0x00, 0x32, 0x00, 0x35, 0x00, 0x39, 0x00, 0x36,
                0x00, 0x37, 0x00, 0x38, 0x00, 0x37, 0x00, 0xd2, 0x0b, 0x27, 0x68, 0x00, 0x74, 0x00,
                0x74, 0x00, 0x70, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x73, 0x00, 0x75, 0x00,
                0x70, 0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x2e, 0x00, 0x6d, 0x00,
                0x69, 0x00, 0x63, 0x00, 0x72, 0x00, 0x6f, 0x00, 0x73, 0x00, 0x6f, 0x00, 0x66, 0x00,
                0x74, 0x00, 0x2e, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x2f, 0x00, 0x6b, 0x00,
                0x62, 0x00, 0x2f, 0x00, 0x32, 0x00, 0x35, 0x00, 0x39, 0x00, 0x36, 0x00, 0x37, 0x00,
                0x38, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00, 0x00,
            ]
            .to_vec(),
        );
        // Same data: 43 42 01 00 0a 02 01 00 2a 06 b9 dd ae aa 06 2a 2b 0e a1 04 43 42 01 00 12 60 7b 00 39 00 30 00 31 00 32 00 30 00 30 00 30 00 30 00 2d 00 30 00 30 00 31 00 37 00 2d 00 30 00 30 00 30 00 30 00 2d 00 30 00 30 00 30 00 30 00 2d 00 30 00 30 00 30 00 30 00 30 00 30 00 30 00 46 00 46 00 31 00 43 00 45 00 7d 00 5f 00 53 00 68 00 61 00 72 00 65 00 50 00 6f 00 69 00 6e 00 74 00 44 00 65 00 73 00 69 00 67 00 6e 00 65 00 72 00 5f 00 7b 00 30 00 32 00 32 00 30 00 36 00 44 00 43 00 43 00 2d 00 30 00 43 00 41 00 46 00 2d 00 34 00 36 00 42 00 42 00 2d 00 38 00 45 00 44 00 43 00 2d 00 36 00 43 00 32 00 38 00 31 00 41 00 41 00 32 00 31 00 45 00 46 00 41 00 7d 00 32 0c 45 00 78 00 74 00 65 00 72 00 6e 00 61 00 6c 00 20 00 4d 00 53 00 49 00 4a 00 6a 00 92 42 55 00 70 00 64 00 61 00 74 00 65 00 20 00 66 00 6f 00 72 00 20 00 4d 00 69 00 63 00 72 00 6f 00 73 00 6f 00 66 00 74 00 20 00 4f 00 66 00 66 00 69 00 63 00 65 00 20 00 32 00 30 00 30 00 37 00 20 00 73 00 75 00 69 00 74 00 65 00 73 00 20 00 28 00 4b 00 42 00 32 00 35 00 39 00 36 00 37 00 38 00 37 00 29 00 20 00 33 00 32 00 2d 00 42 00 69 00 74 00 20 00 45 00 64 00 69 00 74 00 69 00 6f 00 6e 00 b2 09 4d 00 69 00 63 00 72 00 6f 00 73 00 6f 00 66 00 74 00 d2 0a 27 68 00 74 00 74 00 70 00 3a 00 2f 00 2f 00 73 00 75 00 70 00 70 00 6f 00 72 00 74 00 2e 00 6d 00 69 00 63 00 72 00 6f 00 73 00 6f 00 66 00 74 00 2e 00 63 00 6f 00 6d 00 2f 00 6b 00 62 00 2f 00 32 00 35 00 39 00 36 00 37 00 38 00 37 00 d2 0b 27 68 00 74 00 74 00 70 00 3a 00 2f 00 2f 00 73 00 75 00 70 00 70 00 6f 00 72 00 74 00 2e 00 6d 00 69 00 63 00 72 00 6f 00 73 00 6f 00 66 00 74 00 2e 00 63 00 6f 00 6d 00 2f 00 6b 00 62 00 2f 00 32 00 35 00 39 00 36 00 37 00 38 00 37 00 00 00 00 00

        // Print each interpretation together with the index at which it was found.
        for datum in byte_seq.find_data(true) {
            println!("{:?}", datum);
        }
    }

    //...
}

pub enum ParseError {
    //...
    /// Expected an ANSI or ASCII string with 1 byte per character, either null-terminated or of the requested length. Holds the byte index at which reading was attempted.
    #[error("expected an ANSI string at byte index {0}")]
    ExpectedAnsiString(usize),
    /// Expected a wide string with 2 bytes per character (little endian), either null-terminated or of the requested length. Holds the byte index at which reading was attempted.
    #[error("expected a wide string at byte index {0}")]
    ExpectedWideString(usize),
    //...
}